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DETERMINE A BLOCK WIDTH FROM 
THE NUMBER OF THREADS AND THE 
SIZE OF THE PROBLEM 



S2 



COPY BLOCKS (D AND Li) TO BE 
PROCESSED BY THREADS 
(PROCESSORS) EACH DETERMINING A 

BLOCK TO BE PROCESSED TO A 
WORKING AREA 



r 



LET EACH THREAD TAKE IN PIVOTS. 

IDENTIFY A LARGEST PIVOT AND 
TRANSPOSE A ROW VECTOR BY USING A 

SHARED AREA. LET EACH THREAD 
PERFORM LU FACTORIZATION ON D + Li. 




LET THE THREADS UPDATE Ui BY 
USING LL IN PARALLEL 



LET THE THREADS UPDATE Ci WITH 
A PRODUCT OF Li AND U IN 
PARALLEL 



END 
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DO i = 1, iblks 

  TMP=0.0D0; jj=0
  DO j = i, leng
    IF(ABS(LT(j, i)) .GT. TMP) THEN
      TMP=ABS(LT(j, i))
      jj=j
    ENDIF
  ENDDO



  IF(jj .GT. i) THEN
    DO k=1, iblks
      TMPX=LT(i, k)
      LT(i, k)=LT(jj, k)
      LT(jj, k)=TMPX
    ENDDO
  END IF

  DO k=i+1, iblks
    LT(i, k)=LT(i, k)/LT(i, i)
  ENDDO

  DO k=i+1, iblks
    DO l=i+1, leng
      LT(l, k)=LT(l, k)-LT(l, i)*LT(i, k)
    ENDDO
  ENDDO

ENDDO 
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DO i=1, iblks 

  TMP=0.0D0; jj=0
  DO j=i, lengi
    IF(ABS(LTi(j, i)) .GT. TMP) THEN
      TMP=ABS(LTi(j, i))                                  (4)
      jj=j
    ENDIF
  ENDDO

  pivot(#THREAD)=jj

  (#THREAD IS A THREAD NUMBER. IN THE
  CASE OF PARALLEL PROCESSING BY 4                        (5)
  THREADS, #THREAD IS PRESCRIBED AS
  1, 2, 3 AND 4.)

BARRIER SYNCHRONIZATION 
  IF(#THREAD .EQ. 1)
    jx=0; GPIVOT=0
    DO ix=1, 4
      IF(pivot(ix) .GT. jx .AND. PIVOT(ix) .GT. iblks) GPIVOT=ix
      (THE NUMBER OF A THREAD HAVING A LARGEST NUMBER)
    ENDDO
  END IF

BARRIER SYNCHRONIZATION 

  IF(#THREAD .EQ. GPIVOT) THEN
    IF(jj .GT. 0) THEN
      DO ix=1, iblks
        ROW(ix)=LTi(jj, ix)
      ENDDO
    END IF

BARRIER SYNCHRONIZATION 
  IF(GPIVOT .EQ. 0) THEN
    IF(jj .GT. i) THEN
      DO ix = 1, iblks
        TMPW=LTi(i, ix)
        LTi(i, ix)=LTi(jj, ix)
        LTi(jj, ix)=TMPW
      ENDDO
    END IF
  ELSE
    IF(#THREAD .EQ. GPIVOT) THEN
      DO ix=1, iblks
        LTi(jj, ix)=LTi(i, ix)
        LTi(i, ix)=ROW(ix)
      ENDDO
    ELSE
      DO ix=1, iblks
        LTi(i, ix)=ROW(ix)
      ENDDO
    ENDIF



  (SINCE TRANSPOSITION HAS
  BEEN CARRIED OUT IN AN IP,
  THE THREADS EXECUTE THE
  PROCESSING IN PARALLEL.)



  DO k=i+1, iblks
    LTi(i, k)=LTi(i, k)/LTi(i, i)
  ENDDO

  DO k=i+1, iblks
    DO l=i+1, lengi
      LTi(l, k)=LTi(l, k)-LTi(l, i)*LTi(i, k)
    ENDDO
  ENDDO
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ENDDO 







256 

/ 

384 

V 

r 

384 

/ 

384 

f 

384 

V 


D1 


U1 U2 U3 U4 


L1 


C1 


L2 


C2 


L3 


C3 


L4 


C4 

, 



FIG. 9 



subroutine LU (LTi, k, iblks, ist, nwid) 

(WHERE LTi IS USED BY THREADS FOR STORING (D1 + Li), 
k IS THE SIZE OF THE FIRST ONE DIMENSION OF LTi, 
iblks IS THE BLOCK WIDTH, 
ist IS A POSITION TO START THE LU FACTORIZATION AND 
nwid IS THE WIDTH OF AN OBJECT SUBJECTED TO THE LU FACTORIZATION) 



IF(nwid .eq. 8) THEN  (A WIDTH OF 8 IS A MINIMUM.)

  LTi(ist:k, ist:ist+nwid-1) IS SUBJECTED TO THE LU FACTORIZATION IN 
  PARALLEL. 

  HERE, THE PARTS (4) TO (10) OF FIG. 9 ARE EXECUTED. 
  IN THIS CASE, THE ROW-TRANSPOSING UNIT TRANSPOSES 
  LTi(i, 1:iblks) AT THE LENGTH iblks. 



else 

call LU(LTi, k, iblks, ist, nwid/2) 
call TRS( ) 

  UPDATE LTi(ist:ist+nwid/2-1, ist+nwid/2:ist+nwid). BY USING A 
  LOWER-TRIANGULAR MATRIX LL OF LTi(ist:ist+nwid/2-1, ist:ist+nwid/2-1), 
  UPDATE IT BY MULTIPLYING IT BY LL^{-1} FROM THE LEFT. 

  call MM( ) 

  LTi(ist+nwid/2:k, ist+nwid/2:ist+nwid) 
    = LTi(ist+nwid/2:k, ist+nwid/2:ist+nwid) 
    - LTi(ist+nwid/2:k, ist:ist+nwid/2-1) × 
      LTi(ist:ist+nwid/2-1, ist+nwid/2:ist+nwid) 

Barrier SYNCHRONIZATION 
  call LU(LTi, k, iblks, ist+nwid/2, nwid/2) 
end if 
return 

end subroutine 




FIG. 10 




FIG. 11 



D 




D 




D 


L 1 




L2 




L3 




FIG. 


12 































1 



\ 











































• t 

10 











-4- 




-4- 


I i. 


"CO 

to 

£ t 











-H 




ID 

to 






FIG. 14 



Dt.21 




FIG. 15 



subroutine LDL(LTi, k, iblks, ist, nwid) 
IF(nwid .EQ. 8) THEN  (THE WIDTH OF 8 IS THE MINIMUM) 
  DO i = ist, ist+7 
    DO j = i+1, ist+7 
      LTi(i, j)=LTi(j, i) 
      LTi(j, i)=LTi(j, i)/LTi(i, i) 
    ENDDO                                                 (20) 
    DO jy=i+1, ist+7 
      DO jx=jy, ist+7 
        LTi(jx, jy)=LTi(jx, jy)-LTi(jx, i)*LTi(i, jy) 
      ENDDO 
    ENDDO 

  UPDATE LTi(ist+8:k, ist:ist+7). 
  SINCE DL^T IS INCLUDED IN THE UPPER TRIANGLE OF 
  LTi(ist:ist+7, ist:ist+7), UPDATE (DL^T)^{-1} FROM THE RIGHT. 



call LDL(LTi, k, iblks, ist, nwid/2) 
  COPY DL^T TO 
  LTi(ist:ist+nwid/2-1, ist+nwid/2:ist+nwid-1). 

  (D IS AN OBJECT ELEMENT OF LTi(ist:ist+nwid/2-1, ist:ist+nwid/2-1) 
  AND L IS 
  LTi(ist+nwid/2:ist+nwid-1, ist:ist+nwid/2-1); 
  TRANSPOSING THIS GIVES L^T.) 



  UPDATE LTi(ist+nwid/2:k, ist+nwid/2:ist+nwid-1). 

  LTi(ist+nwid/2:k, ist+nwid/2:ist+nwid-1) 
    = LTi(ist+nwid/2:k, ist+nwid/2:ist+nwid-1) 
    - LTi(ist+nwid/2:k, ist:ist+nwid/2-1) × 
      LTi(ist:ist+nwid/2-1, ist+nwid/2:ist+nwid-1) 

CALL LDL (LTi, k, iblks, ist+nwid/2, nwid/2) 

ENDIF 

RETURN 

END 
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